#include "stdafx.h"

/*
 * -- SuperLU routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 */
/*
 * File name:		smyblas2.c
 * Purpose:
 *     Level 2 BLAS operations: solves and matvec, written in C.
 * Note:
 *     This is only used when the system lacks an efficient BLAS library.
 */

/*
 * Solves a dense UNIT lower triangular system. The unit lower 
 * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). 
 * The solution will be returned in the rhs vector.
 */


#include "hnum_pssp_defs.h"


namespace harlinn
{
    namespace numerics
    {
        namespace SuperLU
        {
            void slsolve ( int ldm, int ncol, float *M, float *rhs )
            {
                int k;
                float x0, x1, x2, x3, x4, x5, x6, x7;
                float *M0;
                register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
                register int firstcol = 0;

                M0 = &M[0];

                while ( firstcol < ncol - 7 ) { /* Do 8 columns */
                    Mki0 = M0 + 1;
                    Mki1 = Mki0 + ldm + 1;
                    Mki2 = Mki1 + ldm + 1;
                    Mki3 = Mki2 + ldm + 1;
                    Mki4 = Mki3 + ldm + 1;
                    Mki5 = Mki4 + ldm + 1;
                    Mki6 = Mki5 + ldm + 1;
                    Mki7 = Mki6 + ldm + 1;

                    x0 = rhs[firstcol];
                    x1 = rhs[firstcol+1] - x0 * *Mki0++;
                    x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++;
                    x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++;
                    x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
	                                - x3 * *Mki3++;
                    x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
	                                - x3 * *Mki3++ - x4 * *Mki4++;
                    x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
	                                - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++;
                    x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
	                                - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++
			                - x6 * *Mki6++;

                    rhs[++firstcol] = x1;
                    rhs[++firstcol] = x2;
                    rhs[++firstcol] = x3;
                    rhs[++firstcol] = x4;
                    rhs[++firstcol] = x5;
                    rhs[++firstcol] = x6;
                    rhs[++firstcol] = x7;
                    ++firstcol;
    
                    for (k = firstcol; k < ncol; k++)
	            rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++
	                            - x2 * *Mki2++ - x3 * *Mki3++
                                    - x4 * *Mki4++ - x5 * *Mki5++
			            - x6 * *Mki6++ - x7 * *Mki7++;
 
                    M0 += 8 * ldm + 8;
                }

                while ( firstcol < ncol - 3 ) { /* Do 4 columns */
                    Mki0 = M0 + 1;
                    Mki1 = Mki0 + ldm + 1;
                    Mki2 = Mki1 + ldm + 1;
                    Mki3 = Mki2 + ldm + 1;

                    x0 = rhs[firstcol];
                    x1 = rhs[firstcol+1] - x0 * *Mki0++;
                    x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++;
                    x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++;

                    rhs[++firstcol] = x1;
                    rhs[++firstcol] = x2;
                    rhs[++firstcol] = x3;
                    ++firstcol;
    
                    for (k = firstcol; k < ncol; k++)
	            rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++
	                            - x2 * *Mki2++ - x3 * *Mki3++;
 
                    M0 += 4 * ldm + 4;
                }

                if ( firstcol < ncol - 1 ) { /* Do 2 columns */
                    Mki0 = M0 + 1;
                    Mki1 = Mki0 + ldm + 1;

                    x0 = rhs[firstcol];
                    x1 = rhs[firstcol+1] - x0 * *Mki0++;

                    rhs[++firstcol] = x1;
                    ++firstcol;
    
                    for (k = firstcol; k < ncol; k++)
	            rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++;
 
                }
    
            }

            /*
                * Solves a dense upper triangular system. The upper triangular matrix is
                * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned
                * in the rhs vector.
                */
            void susolve ( int ldm, int ncol, float *M, float *rhs )
            {
                float xj;
                int jcol, j, irow;

                jcol = ncol - 1;

                for (j = 0; j < ncol; j++) {

	            xj = rhs[jcol] / M[jcol + jcol*ldm]; 		/* M(jcol, jcol) */
	            rhs[jcol] = xj;
	
	            for (irow = 0; irow < jcol; irow++)
	                rhs[irow] -= xj * M[irow + jcol*ldm];	/* M(irow, jcol) */

	            jcol--;

                }
            }


            /*
                * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
                * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[].
                */
            void smatvec (
            int ldm,	/* in -- leading dimension of M */
            int nrow,	/* in */ 
            int ncol,	/* in */
            float *M,	/* in */
            float *vec,	/* in */
            float *Mxvec	/* in/out */
            )
            {
                float vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7;
                float *M0;
                register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
                register int firstcol = 0;
                int k;

                M0 = &M[0];
                while ( firstcol < ncol - 7 ) {	/* Do 8 columns */

	            Mki0 = M0;
	            Mki1 = Mki0 + ldm;
                    Mki2 = Mki1 + ldm;
                    Mki3 = Mki2 + ldm;
	            Mki4 = Mki3 + ldm;
	            Mki5 = Mki4 + ldm;
	            Mki6 = Mki5 + ldm;
	            Mki7 = Mki6 + ldm;

	            vi0 = vec[firstcol++];
	            vi1 = vec[firstcol++];
	            vi2 = vec[firstcol++];
	            vi3 = vec[firstcol++];	
	            vi4 = vec[firstcol++];
	            vi5 = vec[firstcol++];
	            vi6 = vec[firstcol++];
	            vi7 = vec[firstcol++];	

	            for (k = 0; k < nrow; k++) 
	                Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++
		                    + vi2 * *Mki2++ + vi3 * *Mki3++ 
		                    + vi4 * *Mki4++ + vi5 * *Mki5++
		                    + vi6 * *Mki6++ + vi7 * *Mki7++;

	            M0 += 8 * ldm;
                }

                while ( firstcol < ncol - 3 ) {	/* Do 4 columns */

	            Mki0 = M0;
	            Mki1 = Mki0 + ldm;
	            Mki2 = Mki1 + ldm;
	            Mki3 = Mki2 + ldm;

	            vi0 = vec[firstcol++];
	            vi1 = vec[firstcol++];
	            vi2 = vec[firstcol++];
	            vi3 = vec[firstcol++];	
	            for (k = 0; k < nrow; k++) 
	                Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++
		                    + vi2 * *Mki2++ + vi3 * *Mki3++ ;

	            M0 += 4 * ldm;
                }

                while ( firstcol < ncol ) {		/* Do 1 column */

 	            Mki0 = M0;
	            vi0 = vec[firstcol++];
	            for (k = 0; k < nrow; k++)
	                Mxvec[k] += vi0 * *Mki0++;

	            M0 += ldm;
                }
	
            }

            /*
                * Performs dense matrix-vector multiply with 2 vectors:
                *        y0 = y0 + A * x0
                *        y1 = y1 + A * x1
                */
            void smatvec2 (
                            int lda,     /* leading dimension of A */
                            int m,
                            int n,
                            float *A,   /* in - size m-by-n */
                            float *x0,  /* in - size n-by-1 */
                            float *x1,  /* in - size n-by-1 */
                            float *y0,  /* out - size n-by-1 */
                            float *y1   /* out - size n-by-1 */
                            )

            {
                register float v00, v10, v20, v30, v40, v50, v60, v70,
                                v01, v11, v21, v31, v41, v51, v61, v71;
                register float t0, t1, t2, t3, t4, t5, t6, t7;
                register float f0, f1;
                float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
                register int firstcol = 0;
                float *M0;
                int k;

                M0 = &A[0];

                while ( firstcol < n - 7 ) {        /* Do 8 columns */

                    Mki0 = M0;
                    Mki1 = Mki0 + lda;
                    Mki2 = Mki1 + lda;
                    Mki3 = Mki2 + lda;
                    Mki4 = Mki3 + lda;
                    Mki5 = Mki4 + lda;
                    Mki6 = Mki5 + lda;
                    Mki7 = Mki6 + lda;

                    v00 = x0[firstcol];   v01 = x1[firstcol++];
                    v10 = x0[firstcol];   v11 = x1[firstcol++];
                    v20 = x0[firstcol];   v21 = x1[firstcol++];
                    v30 = x0[firstcol];   v31 = x1[firstcol++];
                    v40 = x0[firstcol];   v41 = x1[firstcol++];
                    v50 = x0[firstcol];   v51 = x1[firstcol++];
                    v60 = x0[firstcol];   v61 = x1[firstcol++];
                    v70 = x0[firstcol];   v71 = x1[firstcol++];

                    for (k = 0; k < m; k++) {
                        f0 = y0[k];
                        f1 = y1[k];
                        t0 = Mki0[k];  f0 += v00 * t0;  f1 += v01 * t0;
                        t1 = Mki1[k];  f0 += v10 * t1;  f1 += v11 * t1;
                        t2 = Mki2[k];  f0 += v20 * t2;  f1 += v21 * t2;
                        t3 = Mki3[k];  f0 += v30 * t3;  f1 += v31 * t3;
                        t4 = Mki4[k];  f0 += v40 * t4;  f1 += v41 * t4;
                        t5 = Mki5[k];  f0 += v50 * t5;  f1 += v51 * t5;
                        t6 = Mki6[k];  f0 += v60 * t6;  f1 += v61 * t6;
                        t7 = Mki7[k];  f0 += v70 * t7;  f1 += v71 * t7;
                        y0[k] = f0;
                        y1[k] = f1;
                    }

                    M0 += 8 * lda;
                }

                while ( firstcol < n - 3 ) {        /* Do 4 columns */
                    Mki0 = M0;
                    Mki1 = Mki0 + lda;
                    Mki2 = Mki1 + lda;
                    Mki3 = Mki2 + lda;

                    v00 = x0[firstcol];   v01 = x1[firstcol++];
                    v10 = x0[firstcol];   v11 = x1[firstcol++];
                    v20 = x0[firstcol];   v21 = x1[firstcol++];
                    v30 = x0[firstcol];   v31 = x1[firstcol++];

                    for (k = 0; k < m; k++) {
                        f0 = y0[k];
                        f1 = y1[k];
                        t0 = Mki0[k];  f0 += v00 * t0;  f1 += v01 * t0;
                        t1 = Mki1[k];  f0 += v10 * t1;  f1 += v11 * t1;
                        t2 = Mki2[k];  f0 += v20 * t2;  f1 += v21 * t2;
                        t3 = Mki3[k];  f0 += v30 * t3;  f1 += v31 * t3;
                        y0[k] = f0;
                        y1[k] = f1;
                    }

                    M0 += 4 * lda;

                }

                while ( firstcol < n ) {            /* Do 1 column */
                    Mki0 = M0;
                    v00 = x0[firstcol];   v01 = x1[firstcol++];

                    for (k = 0; k < m; k++) {
                        f0 = y0[k];
                        f1 = y1[k];
                        t0 = Mki0[k];
                        f0 += v00 * t0;
                        f1 += v01 * t0;
                        y0[k] = f0;
                        y1[k] = f1;
                    }

                    M0 += lda;
                }

            }

        };
    };
};
